import time
from urllib.parse import urljoin

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

from 爬取导师的每一页个人信息链接 import get_teacher_url
# HTTP request headers passed to requests.get() when fetching teacher pages.
# All values are hard-coded; the cookie and user-agent strings are assembled
# from their individual parts to keep the lines short.
headers = {
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': '; '.join([
        'Hm_lvt_c3acb27768b401b6598a1ae2797371a4=1693548649,1693581545',
        'Hm_lpvt_c3acb27768b401b6598a1ae2797371a4=1693581548',
    ]),
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': ' '.join([
        'Mozilla/5.0 (Windows NT 10.0; WOW64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/84.0.4147.89',
        'Safari/537.36',
        'SLBrowser/7.0.0.6251',
        'SLBChan/124',
    ]),
}

from 爬取姓名 import Name
# Query the faculty listing page through the project's Name helper.
s1 = Name("https://sice.bupt.edu.cn/szdw1.htm")

# Link text of every entry in the listing.
xintong_name = s1.get_all_name('//div[@class="list_li clearfloat"]/ul/li/a/text()')
print(xintong_name)

# href of every entry in the listing. Any href that does not contain "http"
# is prefixed with the site root; the others are kept unchanged.
xintong_geren_url = s1.get_all_name('//div[@class="list_li clearfloat"]/ul/li/a/@href')
xintong_geren_url_real = [
    href if "http" in href else 'https://sice.bupt.edu.cn/' + href
    for href in xintong_geren_url
]
print(xintong_geren_url_real)

# Scrape each teacher's personal page: bio text, e-mail and photo URL(s).
#
# Two page layouts are handled, selected by whether the URL contains "zh_CN":
#   * "zh_CN" pages: bio and e-mail are read from the fetched HTML, the photo
#     <img> is read through a Selenium-driven Chrome instance.
#   * all other pages: bio and photos are read from the "v_news_content"
#     block of the fetched HTML; the e-mail is recorded as the placeholder
#     ["待查"].
#
# Results:
#   xintong_gerenjianjie_real - one list of text fragments per page
#   xintong_email_real        - one list per page (may be empty)
#   xintong_img_real          - flat list of absolute photo URLs
xintong_email_real = []
xintong_img_real = []
xintong_gerenjianjie_real = []

# Chrome is started lazily (only if a "zh_CN" page is encountered) and shared
# by all such pages instead of launching one browser per page.
driver = None
try:
    for url in xintong_geren_url_real:
        # The timeout keeps one unresponsive page from hanging the whole run.
        response = requests.get(url=url, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        # Parse the page so it can be queried with XPath.
        tree = etree.HTML(response.text)

        if "zh_CN" in url:
            gerenjianjie = tree.xpath('//div[@class="ct siglecrifbx"]//text()')
            xintong_gerenjianjie_real.append(gerenjianjie)

            # `email` used to be read here without ever being assigned in
            # this branch, so it was undefined or left over from another page.
            # NOTE(review): this XPath is the one from the previously
            # commented-out line - confirm the e-mail is present in the
            # static HTML of these pages.
            email = tree.xpath('//div[@class="bs bs-1"]/p//text()')
            # Keep the fragments only if one of them looks like an address;
            # the test is per fragment, not list membership of "@".
            if not any("@" in text or "bupt" in text for text in email):
                email = []
            xintong_email_real.append(email)

            if driver is None:
                driver = webdriver.Chrome()
            driver.get(url)
            time.sleep(2)  # presumably lets the page finish rendering
            # find_elements returns [] instead of raising when no photo exists.
            photos = driver.find_elements(By.XPATH, '//div[@class="img"]/span/img')
            if photos:
                src = photos[0].get_attribute('src')
                print(src)
                if src:
                    # Store the photo URL; it used to be printed and dropped.
                    xintong_img_real.append(src)
        else:
            gerenjianjie = tree.xpath('//div[@class="v_news_content"]/p//text()')
            xintong_gerenjianjie_real.append(gerenjianjie)
            xintong_img = tree.xpath('//div[@class="v_news_content"]/p/img/@src')
            email = ["待查"]
            xintong_email_real.append(email)
            for img in xintong_img:
                # Resolve against the page URL so root-relative, relative and
                # already-absolute src values all yield a valid URL.
                xintong_img_real.append(urljoin(url, img))
finally:
    # quit() ends the browser and the driver process even if a page failed.
    if driver is not None:
        driver.quit()

# Dump the collected data: each bio on its own line, then the e-mail list
# and the photo URL list.
for bio in xintong_gerenjianjie_real:
    print(bio)
for collected in (xintong_email_real, xintong_img_real):
    print(collected)
